library(ggplot2)
library(ggplotify)
library(ggsignif)
library(Cairo)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(patchwork)
library(factoextra)
library(parallel)
library(uwot)
library(WGCNA)
library(ggsci)
library(topGO)
library(GO.db)
library(org.Hs.eg.db)
library(parallel)
library(doParallel)
library(biomaRt)
library(openxlsx)
registerDoParallel(makeCluster(12))
register(MulticoreParam(12))
options(gsubfn.engine = "R")
set.seed(12345)
source("~/utils/R/pretty_MA_plot.R")
source("~/utils/R/k_means_figure.R")
source("~/utils/R/plotPCA_manual.R")
fig2_pal = pal_npg("nrc")(9)
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server 7.5 (Maipo)
Matrix products: default
BLAS: /hpc/software/lapack/3.6.0_gcc/lib64/libblas.so.3.6.0
LAPACK: /hpc/software/lapack/3.6.0_gcc/lib64/liblapack.so.3.6.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] openxlsx_4.1.5 ggsignif_0.6.0 GenomicAlignments_1.22.1 Rsamtools_2.2.3
[5] Biostrings_2.54.0 XVector_0.26.0 pryr_0.1.4 rasterize_0.1
[9] sp_1.4-2 ggrastr_0.1.9 Cairo_1.5-12 ggplotify_0.0.5
[13] RPushbullet_0.3.3 ggrepel_0.8.2 gtools_3.8.2 biomaRt_2.42.1
[17] corrplot_0.84 ggforce_0.3.2 googledrive_1.0.1 googlesheets4_0.2.0
[21] spgs_1.0-3 readxl_1.3.1 rtracklayer_1.46.0 forcats_0.5.0
[25] stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4 readr_1.3.1
[29] tidyr_1.1.0 tibble_3.0.1 tidyverse_1.3.0 doParallel_1.0.15
[33] iterators_1.0.12 foreach_1.5.0 org.Hs.eg.db_3.10.0 topGO_2.38.1
[37] SparseM_1.78 GO.db_3.10.0 AnnotationDbi_1.48.0 graph_1.64.0
[41] ggsci_2.9 WGCNA_1.69 fastcluster_1.1.25 dynamicTreeCut_1.63-1
[45] uwot_0.1.8 Matrix_1.2-18 factoextra_1.0.7 patchwork_1.0.1
[49] viridis_0.5.1 viridisLite_0.3.0 RColorBrewer_1.1-2 pheatmap_1.0.12
[53] DESeq2_1.26.0 SummarizedExperiment_1.16.1 DelayedArray_0.12.3 BiocParallel_1.20.1
[57] matrixStats_0.56.0 Biobase_2.48.0 GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
[61] IRanges_2.20.2 S4Vectors_0.24.4 BiocGenerics_0.34.0 ggplot2_3.3.2
loaded via a namespace (and not attached):
[1] backports_1.1.8 Hmisc_4.4-0 BiocFileCache_1.10.2 splines_3.6.3 digest_0.6.25
[6] htmltools_0.5.0 fansi_0.4.1 magrittr_1.5 checkmate_2.0.0 memoise_1.1.0
[11] cluster_2.1.0 annotate_1.64.0 modelr_0.1.8 askpass_1.1 prettyunits_1.1.1
[16] jpeg_0.1-8.1 colorspace_1.4-1 rappdirs_0.3.1 blob_1.2.1 rvest_0.3.5
[21] haven_2.3.1 xfun_0.15 crayon_1.3.4 RCurl_1.98-1.2 jsonlite_1.7.0
[26] genefilter_1.68.0 impute_1.60.0 survival_3.1-8 glue_1.4.1 polyclip_1.10-0
[31] gtable_0.3.0 zlibbioc_1.32.0 scales_1.1.1 DBI_1.1.0 Rcpp_1.0.4.6
[36] progress_1.2.2 xtable_1.8-4 htmlTable_2.0.0 gridGraphics_0.5-0 foreign_0.8-75
[41] bit_1.1-15.2 preprocessCore_1.48.0 Formula_1.2-3 htmlwidgets_1.5.1 httr_1.4.1
[46] acepack_1.4.1 ellipsis_0.3.1 farver_2.0.3 pkgconfig_2.0.3 XML_3.99-0.3
[51] nnet_7.3-12 dbplyr_1.4.4 locfit_1.5-9.4 tidyselect_1.1.0 rlang_0.4.6
[56] munsell_0.5.0 cellranger_1.1.0 tools_3.6.3 cli_2.0.2 generics_0.0.2
[61] RSQLite_2.2.0 broom_0.5.6 knitr_1.29 bit64_0.9-7 fs_1.4.2
[66] zip_2.0.4 nlme_3.1-144 xml2_1.3.2 compiler_3.6.3 rstudioapi_0.11
[71] beeswarm_0.2.3 curl_4.3 png_0.1-7 reprex_0.3.0 tweenr_1.0.1
[76] geneplotter_1.64.0 stringi_1.4.6 lattice_0.20-38 vctrs_0.3.1 pillar_1.4.4
[81] lifecycle_0.2.0 BiocManager_1.30.10 data.table_1.12.8 bitops_1.0-6 R6_2.4.1
[86] latticeExtra_0.6-29 gridExtra_2.3 vipor_0.4.5 codetools_0.2-16 MASS_7.3-51.5
[91] assertthat_0.2.1 openssl_1.4.2 withr_2.2.0 GenomeInfoDbData_1.2.2 hms_0.5.3
[96] rpart_4.1-15 rvcheck_0.1.8 lubridate_1.7.9 base64enc_0.1-3 ggbeeswarm_0.6.0
mp = readRDS("/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_script_bulk_des_no_hURNA.rds")
mp$covid_confirmed = factor(mp$covid_confirmed)
mp$sex = factor(mp$gender)
#set up for comparison by diagnosis
mp$Diagnosis = gsub("-| ", "_", as.character(mp$pna_type))
mp$Diagnosis = factor(mp$Diagnosis,
levels = c("Healthy_Control", "Non_Pneumonia_Control", "COVID_19",
"Other_Viral_Pneumonia", "Other_Pneumonia"))
mp$neutrophilic = mp$percent_neutrophils > 50
#for modeling, etc
mp$finite_day_of_intubation = mp$day_of_intubation
mp$finite_day_of_intubation[is.infinite(mp$finite_day_of_intubation)] = NA
mp$day0_sample = factor(mp$day_of_intubation >= 0 & mp$day_of_intubation <=2)
mp$day0_sample[is.na(mp$day0_sample)] = FALSE
design(mp) = as.formula("~ Diagnosis")
metadata = as.data.frame(colData(mp))
table(mp$pna_type, useNA = "always")
Healthy Control Non-Pneumonia Control COVID-19 Other Viral Pneumonia Other Pneumonia
5 24 82 29 103
<NA>
0
bulk_patients = colData(mp) %>%
as.data.frame() %>%
dplyr::select(study_id, covid_confirmed, pna_type) %>%
unique()
table(bulk_patients$covid_confirmed)
FALSE TRUE
135 51
table(bulk_patients$pna_type)
Healthy Control Non-Pneumonia Control COVID-19 Other Viral Pneumonia Other Pneumonia
5 23 51 23 84
n_samples = table(metadata$patient_id)
serial_patients = names(n_samples[n_samples > 1])
length(serial_patients)
[1] 41
ggplot(metadata, aes(x = day_of_intubation)) +
geom_histogram(fill = "dodgerblue4")
mp_vst = vst(mp, nsub = 3000, fitType = "local") #know from later that local is best
mp_counts = counts(estimateSizeFactors(mp), normalized = T)
pca_data = plotPCA_manual(mp_vst,
intgroup = c("covid_confirmed"),
pcs = 10)
pca_data$data = left_join(pca_data$data,
as.data.frame(colData(mp)),
by = c("name" = "sample", "covid_confirmed"))
#merge counts and pca data
gene_conv = as.data.frame(rowData(mp)) %>%
rownames_to_column(var = "ensembl_gene_id")
merge_counts = mp_counts %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var = "name")
#merge together for featureplot dataset
fp_data = left_join(pca_data$data, merge_counts, by = "name")
#scree plot
fviz_eig(pca_data$pca,
barfill = "dodgerblue4",
xlab = "Principal Component",
ylab = "Percent Variance Explained",
main = "",
ncp = 50,
ggtheme = theme_bw(),
addlabels = T) #4-5 PCs will do it
loadings = pca_data$pca$rotation %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_gene_id") %>%
right_join(gene_conv, .)
Joining, by = "ensembl_gene_id"
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = percent_total_CD206_high)) +
geom_point(size = 3)
fabp4_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000170323))) +
geom_point() +
scale_color_viridis(name = "Log2(FABP4 Counts)")
spp1_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000118785))) +
geom_point() +
scale_color_viridis(name = "Log2(SPP1 Counts)")
fabp4_plot + spp1_plot
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = C_Reactive_Protein)) +
geom_point(size = 3)
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = LDH)) +
geom_point(size = 3)
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = FERRITIN)) +
geom_point(size = 3)
inhba_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000122641))) +
geom_point() +
scale_color_viridis(name = "Log2(INHBA)")
il1b_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000125538))) +
geom_point() +
scale_color_viridis(name = "Log2(IL1b)")
After adding healthy controls this is longer true.
ggplot(fp_data, aes(x = PC2, y = PC3, color = gender)) +
geom_point(size = 3)
xist_plot = ggplot(fp_data, aes(x = PC2, y = PC3, color = log2(ENSG00000229807))) +
geom_point() +
scale_color_viridis(name = "Log2(XIST)")
Not really
ggplot(fp_data, aes(x = PC1, y = PC2, color = Diagnosis)) +
geom_point(size = 2) +
stat_ellipse()
ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(day_of_intubation))) +
geom_point(size = 3)
No clear separation.
#note: DESeq is refitting far too many genes. Need to turn this off.
mp_des_parametric = DESeq(mp,
fitType = "parametric",
parallel = T,
minReplicatesForReplace = Inf)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 12 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 12 workers
plotDispEsts(mp_des_parametric, cex = 0.1)
This fit really isn’t bad, but overestimates at lower expression (probably not a huge deal).
mp_des_local = DESeq(mp,
fitType = "local",
parallel = T,
minReplicatesForReplace=Inf)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 12 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 12 workers
plotDispEsts(mp_des_local, cex = 0.1)
mp_des = mp_des_local
rm(mp_des_local, mp_des_parametric)
Still not perfect, but a lot better.
#used a few times later; safest to put here
#identify infected samples
non_human_genes = subset(gene_conv, !(grepl("^ENSG", ensembl_gene_id))) #human genes all have this prefix
non_human_genes$gene_name[non_human_genes$ensembl_gene_id == "gene-UJ99_s6gp1"] = "NA" #neuraminidase was misread
non_human_genes$virus = factor(ifelse(grepl("UJ99", non_human_genes$ensembl_gene_id),
yes = "Influenza A/California/07/2009",
no = "SARS-CoV-2"))
cov2_genes = subset(non_human_genes, grepl("SARS|GU280", ensembl_gene_id))
iav_genes = subset(non_human_genes, grepl("UJ99", ensembl_gene_id))
flu_counts = counts(mp_des, normalized = T)[iav_genes$ensembl_gene_id, ]
flu_detected = colnames(flu_counts[, colSums(flu_counts) > 0])
cov2_counts = counts(mp_des, normalized = T)[cov2_genes$ensembl_gene_id, ]
cov2_detected = colnames(cov2_counts[, colSums(cov2_counts) > 0])
mp_des$sars_cov2_detected = factor(mp_des$sample %in% cov2_detected)
mp_des$iav_h1n1_detected = factor(mp_des$sample %in% flu_detected)
## Replicating CoV2 samples
### overall
antisense_counts = cov2_counts["SARS_CoV_2_antisense_genome", ]
sense_counts = cov2_counts[2:nrow(cov2_counts), ]
replicating_cov2 = intersect(names(antisense_counts)[antisense_counts > 0],
colnames(sense_counts[, colSums(sense_counts) > 0]))
### just covid
percent_detected_covid = length(unique(subset(metadata, sample %in% cov2_detected & covid_confirmed == TRUE)$study_id)) / length(unique(subset(metadata, covid_confirmed == TRUE)$study_id)) * 100
percent_detected_covid
[1] 66.66667
percent_replicating_covid = length(unique(subset(metadata, sample %in% replicating_cov2 & covid_confirmed == TRUE)$study_id)) / length(unique(subset(metadata, covid_confirmed == TRUE)$study_id)) * 100
percent_replicating_covid
[1] 25.4902
percent_replicating_covid / percent_detected_covid * 100
[1] 38.23529
Does expression of viral genes correlate with covid diagnosis?
cov2_counts = lapply(cov2_genes$ensembl_gene_id, function(x){
counts = plotCounts(mp,
gene = x,
intgroup = "Diagnosis",
returnData = T,
pc = T)
counts$gene = x
return(counts)})
cov2_counts = bind_rows(cov2_counts)
ggplot(cov2_counts, aes(x = Diagnosis, y = count, fill = gene)) +
geom_boxplot() +
scale_y_log10()
Now that we have all of the necessary metadata, this looks fantastic.
viral_counts = counts(mp_des, normalized = T)
viral_counts = viral_counts[non_human_genes$ensembl_gene_id, ]
viral_counts = log10(viral_counts + 0.5)
virus_detection = vector(mode = "character", length = ncol(mp_des))
for(i in 1:ncol(mp_des))
{
has_covid = mp_des$covid_confirmed[i] == TRUE
has_iav_h1n1 = mp_des$INFLUENZA_A_H1_2009[i] == "Positive"
has_other_coronavirus = mp_des$any_coronavirus_non_covid[i] == TRUE
#recode NA as FALSE for safety
if(is.na(has_iav_h1n1))
{
has_iav_h1n1 = FALSE
}
if(is.na(has_other_coronavirus))
{
has_other_coronavirus = FALSE
}
if(has_covid == TRUE && has_iav_h1n1 == TRUE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "SARS-CoV-2 & Other Coronavirus & Influenza A/California/07/2009"
} else if(has_covid == TRUE && has_iav_h1n1 == FALSE && has_other_coronavirus == FALSE)
{
virus_detection[i] = "SARS-CoV-2"
} else if(has_covid == FALSE && has_iav_h1n1 == TRUE && has_other_coronavirus == FALSE)
{
virus_detection[i] = "Influenza A/California/07/2009"
} else if(has_covid == FALSE && has_iav_h1n1 == FALSE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "Other Coronavirus"
} else if(has_covid == TRUE && has_iav_h1n1 == TRUE && has_other_coronavirus == FALSE)
{
virus_detection[i] = "SARS-CoV-2 & Influenza A/California/07/2009"
} else if(has_covid == TRUE && has_iav_h1n1 == FALSE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "SARS-CoV-2 & Other Coronavirus"
} else if(has_covid == FALSE && has_iav_h1n1 == TRUE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "Other Coronavirus & Influenza A/California/07/2009"
} else
{
virus_detection[i] = NA
}
}
mp_des$detected_viruses = factor(virus_detection)
diagnosis_annotation = as.data.frame(colData(mp_des)) %>%
dplyr::select(Diagnosis = detected_viruses)
gene_annotation = non_human_genes %>%
remove_rownames() %>%
column_to_rownames("ensembl_gene_id") %>%
dplyr::select(`Viral Origin` = virus)
gene_names = non_human_genes$gene_name
gene_names[gene_names == "SARS_CoV_2_antisense_genome"] = "Antisense"
viral_expression_heatmap = pheatmap(viral_counts,
clustering_method = "ward.D2",
show_colnames = F,
color = inferno(100),
annotation_col = diagnosis_annotation,
annotation_row = gene_annotation,
labels_row = gene_names,
annotation_colors = list(Diagnosis = c("SARS-CoV-2" = fig2_pal[1], "Other Coronavirus" = fig2_pal[2],
"Influenza A/California/07/2009" = fig2_pal[3], "Other Coronavirus & Influenza A/California/07/2009" = fig2_pal[4]),
`Viral Origin` = c("SARS-CoV-2" = fig2_pal[1], "Influenza A/California/07/2009" = fig2_pal[3])),
angle_col = 45,
annotation_names_row = F,
fontsize = 24)
This patient apparently met criteria for COVID well before the first known cases in Chicago
cov2_counts = counts(mp_des, normalized = T)[cov2_genes$ensembl_gene_id, ]
samples_1174 = mp_des$study_id == "1174"
samples_1174[is.na(samples_1174)] = FALSE
tubes_1174 = mp_des$sample[samples_1174]
cov2_counts_1174 = cov2_counts[, tubes_1174]
cov2_counts_1174
SARS_CoV_2_antisense_genome gene-GU280_gp01 gene-GU280_gp02 gene-GU280_gp03
0 0 0 0
gene-GU280_gp04 gene-GU280_gp05 gene-GU280_gp06 gene-GU280_gp07
0 0 0 0
gene-GU280_gp08 gene-GU280_gp09 gene-GU280_gp10 gene-GU280_gp11
0 0 0 0
No detection, at least in macrophages.
Checking for viral clearance
covid_cases = mp_des[, mp_des$covid_confirmed == TRUE]
covid_counts = counts(covid_cases, normalized = T)
covid_counts = covid_counts[cov2_genes$ensembl_gene_id, ]
covid_counts = covid_counts %>%
as.data.frame() %>%
rownames_to_column(var = "gene") %>%
pivot_longer(cols = "340195046":"340239867",
names_to = "sample",
values_to = "Counts") %>%
left_join(., non_human_genes, by = c("gene" = "ensembl_gene_id")) %>%
dplyr::select(-c(gene, virus)) %>%
left_join(., as.data.frame(colData(mp_des)), by = "sample")
ggplot(covid_counts, aes(x = day_of_intubation, y = Counts)) +
facet_wrap(~ gene_name, scales = "free_y") +
geom_point() +
geom_smooth(se = F)
Unfortunately this is somewhat binarized. Better to treat it as such.
time_cor
Spearman's rank correlation rho
data: covid_counts_binarized$all_viral_counts and covid_counts_binarized$day_of_intubation
S = 35853, p-value = 0.0008307
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.445437
col_types = unlist(lapply(metadata, class))
list_cols = names(col_types[col_types == "AsIs"])
safe_metadata = metadata %>%
dplyr::select(-c(all_of(list_cols))) %>%
as.data.frame()
write.csv(safe_metadata, "/projects/b1038/Pulmonary/Workspace/COVID19_bal_ms/200709_bulk_metadata.csv")
write.csv(counts(mp_des, normalized = F), "/projects/b1038/Pulmonary/Workspace/COVID19_bal_ms/200709_bulk_counts_raw.csv")
write.csv(counts(mp_des, normalized = T), "/projects/b1038/Pulmonary/Workspace/COVID19_bal_ms/200709_bulk_counts_deseq2_normalized.csv")
good_samples = mp_des[, ((!is.na(mp_des$RIN) & mp_des$RIN >= 7) &
mp_des$STAR_mqc_generalstats_star_uniquely_mapped_percent >= 30 &
mp_des$featureCounts_mqc_generalstats_featurecounts_percent_assigned >= 30) &
mp_des$RNA_concentration_pg_ul >= 125 |
(mp_des$iav_h1n1_detected == TRUE | mp_des$sars_cov2_detected == TRUE)]
selection_data = as.data.frame(colData(good_samples)) %>%
dplyr::select(sample, tc_pt_study_id, RIN, STAR_mqc_generalstats_star_uniquely_mapped_percent,
featureCounts_mqc_generalstats_featurecounts_percent_assigned, iav_h1n1_detected,
sars_cov2_detected, covid_confirmed, batch, RNA_concentration_pg_ul, pna_type)
selection_data
Keepers:
- 340224808 (IAV) - 304017553 (IAV + likely another coronavirus) - 340239805 (Cov2) - 340233896 (Cov2) - 340239790 (Cov2) - 340239789 (Cov2) - 304012699 (Other – likely another coronavirus) - 300312389 (other) - 304020362 (other)
keepers = selection_data %>%
dplyr::filter(sample %in% c(304020298, 340224808, 304017553, 340239805, 340233896,
340239790, 340239789, 304012699, 300312389, 304020362)) %>%
mutate(vol_for_250_pg = round(250 / RNA_concentration_pg_ul, digits = 3))
keepers$dilution = ifelse(keepers$vol_for_250_pg > 0.5,
yes = 1,
no = ifelse(keepers$vol_for_250_pg > 0.1,
yes = 5,
no = 20))
keepers$dilution_vol_for_250_pg = round(250 / (keepers$RNA_concentration_pg_ul / keepers$dilution), digits = 3)
write.csv(keepers, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200612_resequencing_samples.csv")
keepers
il6_counts = plotCounts(mp_des,
gene = "ENSG00000136244",
normalized = T,
intgroup = "covid_confirmed",
returnData = T) %>%
rownames_to_column(var = "sample") %>%
left_join(.,
metadata,
by = c("sample", "covid_confirmed"))
cor.test(il6_counts$count, il6_counts$percent_total_CD206_high)
Pearson's product-moment correlation
data: il6_counts$count and il6_counts$percent_total_CD206_high
t = -1.967, df = 214, p-value = 0.05047
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.2620931112 0.0002344531
sample estimates:
cor
-0.1332627
il6_plot = ggplot(il6_counts, aes(x = percent_total_CD206_high, y = count, color = covid_confirmed)) +
geom_point(size = 3) +
scale_y_log10() +
ylab("IL6 Counts")
Later confirmed NSD by group. Clearly correlates with MoAM character.
il1b_counts = plotCounts(mp_des,
gene = "ENSG00000125538",
normalized = T,
intgroup = "covid_confirmed",
returnData = T) %>%
rownames_to_column(var = "sample") %>%
left_join(.,
metadata,
by = c("sample", "covid_confirmed"))
cor.test(il1b_counts$count, il1b_counts$percent_total_CD206_high)
Pearson's product-moment correlation
data: il1b_counts$count and il1b_counts$percent_total_CD206_high
t = -3.8972, df = 214, p-value = 0.0001302
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3779331 -0.1283454
sample estimates:
cor
-0.2574278
il1b_plot = ggplot(il1b_counts, aes(x = percent_total_CD206_high, y = count, color = covid_confirmed)) +
geom_point(size = 3) +
scale_y_log10() +
ylab("IL1b Counts")
il6_plot / il1b_plot
Later confirmed significantly downregulated, actually.
Versus healthy controls
cov2_vs_hc_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Healthy_Control"),
alpha = 0.05,
parallel = T))
cov2_vs_hc_results_ids = cov2_vs_hc_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
write.csv(cov2_vs_hc_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_healthycontrols.csv")
cov2_vs_hc_results_ids
Upregulation of CoV-2 genes is gorgeous
Versus non-pna (sick) controls
cov2_vs_npc_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Non_Pneumonia_Control"),
alpha = 0.05,
parallel = T))
cov2_vs_npc_results_ids = cov2_vs_npc_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
write.csv(cov2_vs_npc_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_nonpneumoniacontrols.csv")
cov2_vs_npc_results_ids
Versus other viral PNA
cov2_vs_ovp_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Other_Viral_Pneumonia"),
alpha = 0.05,
parallel = T))
cov2_vs_ovp_results_ids = cov2_vs_ovp_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
cov2_vs_ovp_results_ids
write.csv(cov2_vs_ovp_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_otherviralpneumonia.csv")
Versus other PNA
cov2_vs_other_pna_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Other_Pneumonia"),
alpha = 0.05,
parallel = T))
cov2_vs_other_pna_results_ids = cov2_vs_other_pna_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
write.csv(cov2_vs_other_pna_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_otherpneumonia.csv")
cov2_vs_other_pna_results_ids
pretty_MA_plot(cov2_vs_ovp_results, custom_annotation = gene_conv)
How many clusters does it take?
elbow = k_elbow(dge = mp_des, design = "~Diagnosis",
cores = 12,
minReps = Inf,
random_seed = 12345,
max_k = 25,
qval_cutoff = 0.01)
elbow
5 seems fair.
intubation_pal = colorRampPalette(brewer.pal(n = 9, name = "Purples")[3:9])(max(mp_des$finite_day_of_intubation, na.rm = T) + 1)
intubation_pal[1:3] = rep("#000000",3) #highlight "D0"
cov2_kmeans_noclustering = k_means_figure(dge = mp_des,
design = "~Diagnosis",
k = 5,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("Diagnosis", "finite_day_of_intubation"),
breaks = seq(-2, 2, length.out=101),
cores = 12,
minReps = Inf,
return_genes = T,
display_go_terms = F,
return_go_terms = F,
cluster_columns = F,
show_rownames = F,
baseMeanCutoff = 20,
random_seed = 12345,
qval_cutoff = 0.05,
annotation_colors = list(Diagnosis = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4]),
finite_day_of_intubation = intubation_pal),
custom_order = c(3, 5, 1, 2, 4),
sortColumns = T,
column_sort_factors = c("Diagnosis", "finite_day_of_intubation"))
cov2_kmeans_clustered = k_means_figure(dge = mp_des,
design = "~Diagnosis",
k = 5,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("Diagnosis", "finite_day_of_intubation"),
breaks = seq(-2, 2, length.out=101),
cores = 12,
minReps = Inf,
return_genes = T,
display_go_terms = F,
return_go_terms = T,
cluster_columns = T,
show_rownames = F,
baseMeanCutoff = 20,
random_seed = 12345,
qval_cutoff = 0.05,
annotation_colors = list(Diagnosis = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4]),
finite_day_of_intubation = intubation_pal),
custom_order = c(3, 5, 1, 2, 4))
for(i in 1:length(cov2_kmeans_clustered$GO))
{
cov2_kmeans_clustered$GO[[i]]$cluster = i
}
cov2_kmeans_clustered$go_df = bind_rows(cov2_kmeans_clustered$GO) %>%
arrange(cluster, padj)
cov2_kmeans_clustered$expression = counts(mp_des, normalized = T) %>%
as.data.frame() %>%
rownames_to_column("ensembl_gene_id") %>%
dplyr::filter(ensembl_gene_id %in% cov2_kmeans_clustered$genes$gene) %>%
right_join(gene_conv, .)
saveRDS(cov2_kmeans_clustered, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200802_k_means_clustered.rds")
saveRDS(cov2_kmeans_noclustering, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_k_means_unclustered.rds")
write.csv(cov2_kmeans_clustered$genes, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_k_means_genes.csv")
write.csv(cov2_kmeans_clustered$go_df, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_k_means_GO.csv")
This is actually very cool (and nicely foreshadows the later WGCNA results). C1: mostly COVID-related(!); highly enriched for type I interferon response (not production so much), as well as MHC-I. C2: mostly nonviral pneumonias. Pretty classic STAT3 inflammatory response.
d0_des = mp_des[, !is.na(mp_des$day_of_intubation) & mp_des$day_of_intubation <= 2 & mp_des$day_of_intubation >= 0]
# cov2_d0_kmeans_noclustering = k_means_figure(dge = d0_des,
# design = "~Diagnosis",
# k = 3,
# go_annotations = "org.Hs.eg.db",
# ensembl_db = "hsapiens_gene_ensembl",
# legend_factors = c("Diagnosis", "percent_total_CD206_high"),
# breaks = seq(-3, 3, length.out=101),
# cores = 12,
# minReps = Inf,
# return_genes = T,
# display_go_terms = F,
# return_go_terms = T,
# cluster_columns = F,
# label_fontsize = 2,
# customAnno = gene_conv,
# annoJoinCol = "ensembl_gene_id",
# sortColumns = T,
# colSortFactor = "Diagnosis",
# baseMeanCutoff = 20)
cov2_d0_kmeans_clustered = k_means_figure(dge = d0_des,
design = "~Diagnosis",
k = 3,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("Diagnosis", "percent_total_CD206_high"),
breaks = seq(-3, 3, length.out=101),
cores = 12,
minReps = Inf,
return_genes = T,
display_go_terms = F,
return_go_terms = T,
cluster_columns = T,
label_fontsize = 2,
customAnno = gene_conv,
annoJoinCol = "ensembl_gene_id",
baseMeanCutoff = 20)
This was an interesting COVID-specific hit. According to published data, it is a highly selective chemoattractant for T cells. Would expect that it should correlate with T cell abundance.
ccl24_counts = plotCounts(mp_des,
gene = "ENSG00000106178",
intgroup = "Diagnosis",
normalized = T,
pc = T,
returnData = T)
ggplot(ccl24_counts, aes(x = Diagnosis, y = count, fill = Diagnosis)) +
geom_boxplot(width = 0.5, outlier.shape = NA) +
geom_jitter(size = 0.1, width = 0.25) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4])) +
scale_x_discrete(labels = c("Healthy_Control" = "Healthy\nControl",
"Non_Pneumonia_Control" = "Non-Pneumonia\nControl",
"COVID_19" = "COVID-19",
"Other_Viral_Pneumonia" = "Other Viral\nPneumonia",
"Other_Pneumonia" = "Other\nPneumonia")) +
xlab("") +
ylab("CCL24 Counts") +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 32, family = "Arial"),
plot.title = element_text(hjust = 0.5))
ccl24_cd4 = plotCounts(mp_des,
gene = "ENSG00000106178",
intgroup = "percent_CD4_total",
normalized = T,
returnData = T) %>%
rownames_to_column("sample")
ccl24_cd8 = plotCounts(mp_des,
gene = "ENSG00000106178",
intgroup = "percent_CD8_total",
normalized = T,
returnData = T) %>%
rownames_to_column("sample")
ccl24_tcells = left_join(ccl24_cd4, ccl24_cd8) %>%
pivot_longer(names_to = "flow_parameter",
values_to = "percent",
cols = c(percent_CD8_total, percent_CD4_total))
Joining, by = c("sample", "count")
ggplot(ccl24_tcells, aes(x = count, y = percent)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~flow_parameter) +
scale_x_log10()
Pretty poor correlation
il6_counts = plotCounts(mp_des,
gene = "ENSG00000136244",
intgroup = "Diagnosis",
normalized = T,
pc = T,
returnData = T)
ggplot(il6_counts, aes(x = Diagnosis, y = count, fill = Diagnosis)) +
geom_boxplot(width = 0.5, outlier.shape = NA) +
geom_jitter(size = 0.1, width = 0.25) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4])) +
scale_x_discrete(labels = c("Healthy_Control" = "Healthy\nControl",
"Non_Pneumonia_Control" = "Non-Pneumonia\nControl",
"COVID_19" = "COVID-19",
"Other_Viral_Pneumonia" = "Other Viral\nPneumonia",
"Other_Pneumonia" = "Other\nPneumonia")) +
xlab("") +
ylab("IL-6 Counts") +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 32, family = "Arial"),
plot.title = element_text(hjust = 0.5))
sasha_genes = c("CCL20", "CCL24", "CCL3", "CCL8", "CCR2", "CCL2", "IL1A", "IL1B", "AREG",
"CXCR4", "TNFSF14", "CXCL3", "CXCL5", "CXCL1", "CXCL8", "ATF4", "CD164",
"MFGE8", "IL13RA1", "IL1RL1", "IL18R1", "VSIG4", "VEGFA", "DDIT4", "BAG3",
"DNAJB1", "HSP90AA6P", "HSPA1B", "HSPA1A")
sasha_genes = subset(gene_conv, gene_name %in% sasha_genes)
sasha_counts = mclapply(sasha_genes$ensembl_gene_id, function(x){
counts = plotCounts(mp_des,
gene = x,
intgroup = "Diagnosis",
returnData = T,
normalized = T,
pc = T)
counts$gene = x
return(counts)})
sasha_counts = bind_rows(sasha_counts) %>%
left_join(., sasha_genes, by = c("gene" = "ensembl_gene_id"))
sasha_gene_plot = ggplot(sasha_counts, aes(x = Diagnosis, y = count, fill = Diagnosis)) +
geom_boxplot(width = 0.5, outlier.shape = NA) +
geom_jitter(size = 0.1, width = 0.25) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4])) +
scale_x_discrete(labels = c("Healthy_Control" = "Healthy\nControl",
"Non_Pneumonia_Control" = "Non-Pneumonia\nControl",
"COVID_19" = "COVID-19",
"Other_Viral_Pneumonia" = "Other Viral\nPneumonia",
"Other_Pneumonia" = "Other\nPneumonia")) +
xlab("") +
ylab("Gene Counts") +
theme_bw() +
theme(legend.position = "none") +
facet_wrap(~gene_name)
sasha_gene_plot
detection_rate = function(x)
{
return(sum(x > 0) / length(x))
}
all_counts = counts(mp_des, normalized = T)
#get 500 most variable genes with > 10% detection
rv = rowVars(all_counts)
prop_detected = apply(X = all_counts, MARGIN = 1, FUN = detection_rate)
selection_criteria = data.frame(sample = rownames(all_counts), var = rv, prop_detected = prop_detected) %>%
filter(prop_detected >= 0.1 & var > 0) %>%
arrange(desc(var))
top_genes = as.character(selection_criteria$sample[1:1000])
top_all_counts = all_counts[top_genes, ]
annotation = colData(mp_des) %>%
as.data.frame() %>%
dplyr::select(covid_confirmed,
sars_cov2_detected,
sex,
#day_of_intubation,
infection_type,
percent_neutrophils,
percent_total_CD206_high)
gene_annotation = gene_conv %>%
dplyr::filter(ensembl_gene_id %in% rownames(top_all_counts)) %>%
column_to_rownames(var = "ensembl_gene_id")
pheatmap(top_all_counts,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_col = annotation,
labels_row = gene_annotation$gene_name,
scale = "row",
show_rownames = T,
fontsize_row = 2,
show_colnames = F,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
angle_col = 45)
day0_samples = which(mp$day_of_intubation <= 2)
day0 = mp_des[, day0_samples]
day0_counts = counts(day0, normalized = T)
#get 1000 most variable genes
rv = rowVars(day0_counts)
prop_detected = apply(X = day0_counts, MARGIN = 1, FUN = detection_rate)
selection_criteria = data.frame(sample = rownames(day0_counts), var = rv, prop_detected = prop_detected) %>%
filter(prop_detected >= 0.1 & var > 0) %>%
arrange(desc(var))
top_genes = as.character(selection_criteria$sample[1:1000])
top_day0_counts = day0_counts[top_genes, ]
annotation = colData(day0) %>%
as.data.frame() %>%
dplyr::select(covid_confirmed,
sars_cov2_detected,
gender,
infection_type,
percent_neutrophils,
percent_total_CD206_high)
gene_annotation = gene_conv %>%
dplyr::filter(ensembl_gene_id %in% rownames(top_day0_counts)) %>%
column_to_rownames(var = "ensembl_gene_id")
pheatmap(top_day0_counts,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_col = annotation,
labels_row = gene_annotation$gene_name,
scale = "row",
show_rownames = T,
show_colnames = F,
fontsize_row = 2,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
angle_col = 45)
pathogen_set = mp_des[, !is.na(mp$pathogen)]
pathogen_set$pathogen = factor(as.character(pathogen_set$pathogen)) #refactor
pathogen_kmeans = k_means_figure(dge = pathogen_set,
design = "~pathogen",
k = 6,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("pathogen", "covid_confirmed"),
breaks = seq(-3, 3, length.out=101),
cores = 12)
Probably too granular
infection_type_kmeans = k_means_figure(dge = pathogen_set,
design = "~infection_type",
k = 4,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("pathogen", "infection_type"),
breaks = seq(-3, 3, length.out=101),
cores = 12)
ace2_counts = plotCounts(mp_des,
gene = "ENSG00000130234",
intgroup = "covid_confirmed",
returnData = T,
normalized = T,
pc = F)
ggplot(ace2_counts, aes(x = count)) +
geom_histogram(fill = "dodgerblue4", ) +
ylab("Number of Samples") +
xlab("ACE2 Count")
We will use PCA as is recommended to make computation feasible.
#select genes with > 10% detection, mean counts > 5
all_counts = t(counts(mp_des, normalized = T)) #need size normalization
prop_detected = apply(X = all_counts, MARGIN = 2, FUN = detection_rate)
all_counts = all_counts[, prop_detected > 0.1]
mean_detection = colMeans(all_counts)
all_counts = all_counts[, mean_detection >= 5]
all_pca = prcomp(all_counts)
fviz_eig(all_pca,
barfill = "dodgerblue4",
xlab = "Principal Component",
ylab = "Percent Variance Explained",
main = "",
ncp = 50,
ggtheme = theme_bw(),
addlabels = T) #20 should capture almost everything
total_umap = umap(X = all_counts,
pca = 20,
pca_center = T,
n_threads = 1,
scale = "Z",
min_dist = 0.2)
rownames(total_umap) = rownames(all_counts)
colnames(total_umap) = c("UMAP_1", "UMAP_2")
total_umap = as.data.frame(total_umap)
total_umap$sample = rownames(total_umap)
total_umap = left_join(total_umap, metadata, by = "sample") %>%
arrange(study_id, day_of_intubation) # so we can draw paths
ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, color = Diagnosis)) +
geom_point() +
stat_ellipse()
ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, shape = study_id, color = day_of_intubation)) +
geom_path(arrow = grid::arrow()) +
scale_color_viridis() +
facet_wrap(~ Diagnosis)
Patients actually separate out really well here. This might be worth including as a supplement. With that said, probably better to just use COVID patients for “trajectories”. It looks like there may be some structure to the COVID data.
#remove non-detected genes
#note that PCA should only use genes detected in all samples (3081)
COVID_counts = t(counts(mp_des[, mp_des$Diagnosis == "COVID_19"], normalized = T)) #need size normalization
prop_detected = apply(X = COVID_counts, MARGIN = 2, FUN = detection_rate)
COVID_counts = COVID_counts[, prop_detected > 0.1]
mean_detection = colMeans(COVID_counts)
COVID_counts = COVID_counts[, mean_detection >= 5]
COVID_pca = prcomp(COVID_counts)
fviz_eig(COVID_pca,
barfill = "dodgerblue4",
xlab = "Principal Component",
ylab = "Percent Variance Explained",
main = "",
ncp = 50,
ggtheme = theme_bw(),
addlabels = T) #20 should capture almost everything
COVID_umap = umap(X = COVID_counts,
pca = 20,
pca_center = T,
n_threads = 1,
scale = "Z",
min_dist = 0.4)
rownames(COVID_umap) = rownames(COVID_counts)
colnames(COVID_umap) = c("UMAP_1", "UMAP_2")
COVID_umap = as.data.frame(COVID_umap)
COVID_umap$sample = rownames(COVID_umap)
COVID_umap = left_join(COVID_umap, metadata, by = "sample") %>%
arrange(study_id, day_of_intubation) # so we can draw paths
ggplot(COVID_umap, aes(x = UMAP_1, y = UMAP_2, color = day_of_intubation)) +
scale_color_viridis() +
geom_point()
ggplot(COVID_umap, aes(x = UMAP_1, y = UMAP_2, shape = study_id, color = day_of_intubation)) +
scale_color_viridis() +
geom_path(arrow = grid::arrow())
Not so much structure it seems.
What gene modules are associated with covid? Which change substantially over time on ventilator?
## Thesholding
#setup
enableWGCNAThreads(nThreads = 12)
Allowing parallel execution with up to 12 working processes.
WGCNAnThreads()
[1] 12
# Choose a set of soft-thresholding powers
powers = c(1:20)
# Call the network topology analysis functions
sft = pickSoftThreshold(all_counts, powerVector = powers, verbose = 5, networkType = "signed")
pickSoftThreshold: will use block size 3395.
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 3395 of 13176
..working on genes 3396 through 6790 of 13176
..working on genes 6791 through 10185 of 13176
..working on genes 10186 through 13176 of 13176
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",
type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,
cex=cex1,
col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1],
sft$fitIndices[,5],
xlab="Soft Threshold (power)",
ylab="Mean Connectivity",
type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
7 looks like the intersection after adding healthy controls
softPower = 7
adjacency = adjacency(all_counts, power = softPower, type = "signed")
# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency, TOMType = "signed")
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
geneTree = hclust(as.dist(dissTOM), method = "average")
minModuleSize = 30 # may need to adjust
# Make modules based on clusters
dynamicMods = cutreeDynamic(dendro = geneTree,
distM = dissTOM,
deepSplit = 2,
pamRespectsDendro = FALSE,
minClusterSize = minModuleSize)
..cutHeight not given, setting it to 0.989 ===> 99% of the (truncated) height range in dendro.
..done.
table(dynamicMods)
dynamicMods
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2734 2473 2350 1019 830 711 697 647 370 293 273 271 246 143 119
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
black blue brown cyan green greenyellow magenta midnightblue pink
697 2473 2350 143 830 273 370 119 647
purple red salmon tan turquoise yellow
293 711 246 271 2734 1019
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree,
dynamicColors,
"Dynamic Tree Cut",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05,
main = "Gene dendrogram and module colors")
Looks like we captured the structure pretty well.
# Calculate eigengenes
MEList = moduleEigengenes(all_counts,
colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss),
method = "average")
# Plot the result
sizeGrWindow(7, 6)
plot(METree,
main = "Clustering of module eigengenes",
xlab = "",
sub = "")
#Now cut at height of 0.5 to get correlation of 0.5
#standard is 0.25, but we have some really similar modules that are not relevant
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres,
col = "red")
# Call an automatic merging function
merge = mergeCloseModules(all_counts,
dynamicColors,
cutHeight = MEDissThres,
verbose = 3)
mergeCloseModules: Merging modules whose distance is less than 0.25
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 15 module eigengenes in given set.
Calculating new MEs...
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 15 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs
#replot
sizeGrWindow(12, 9)
plotDendroAndColors(geneTree,
cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05)
No change
# Define numbers of genes and samples
nGenes = ncol(all_counts)
nSamples = nrow(all_counts)
metadata = as.data.frame(colData(mp_des))
md_of_interest = metadata %>%
mutate(deceased = ifelse(binned_outcome == "Deceased",
yes = 1,
no = 0)) %>%
mutate(has_covid = ifelse(covid_confirmed == "TRUE",
yes = 1,
no = 0)) %>%
mutate(cov2_detected = ifelse(sars_cov2_detected == TRUE,
yes = 1,
no = 0)) %>%
mutate(has_pneumonia = ifelse(pna_type == "Non-Pneumonia Control",
yes = 0,
no = 1)) %>%
mutate(coinfection_detected = ifelse(any_nonviral == "TRUE",
yes = 1,
no = 0)) %>%
mutate(other_viral_pna = ifelse(pna_type == "Other Viral Pneumonia",
yes = 1,
no = 0)) %>%
mutate(nonviral_pna = ifelse(pna_type == "Other Pneumonia",
yes = 1,
no = 0)) %>%
mutate(is_healthy_control = ifelse(pna_type == "Healthy Control",
yes = 1,
no = 0)) %>%
dplyr::select(deceased, has_covid, cov2_detected, has_pneumonia,
other_viral_pna, nonviral_pna, is_healthy_control,
percent_neutrophils, percent_total_CD206_high,
percent_CD4_total, percent_CD8_total, finite_day_of_intubation,
C_Reactive_Protein, D_DIMER, PROCALCITONIN, mean_aps)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(all_counts, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = bicor(MEs,
md_of_interest,
use = "pairwise.complete.obs",
maxPOutliers = 0.05)
bicor: zero MAD in variable 'y'. Pearson correlation was used for individual columns with zero (or missing) MAD.
moduleTraitPvalue = corPvalueStudent(moduleTraitCor,
nSamples) %>%
as.numeric() %>%
p.adjust(., method = "fdr") %>%
matrix(ncol = ncol(moduleTraitCor))
colnames(moduleTraitPvalue) = colnames(moduleTraitCor)
rownames(moduleTraitPvalue) = rownames(moduleTraitCor)
moduleTraitCor_hm = t(moduleTraitCor)
#just make significant or not
pvals_hm = t(moduleTraitPvalue) %>%
as.matrix()
pvals_hm[pvals_hm < 0.05] = ""
pvals_hm[pvals_hm != ""] = "NS"
labs = c("Deceased", "COVID-19", "SARS-CoV-2 Detected", "Any Pneumonia", "Other Viral Pneumonia", "Other Pneumonia",
"Healthy Control", "% Neutrophils", "% Macrophages CD206-high", "% CD4 T", "% CD8 T",
"Day of Intubation", "CRP", "D-Dimer", "Procalcitonin", "Mean APS")
module_cor_heatmap = pheatmap(mat = moduleTraitCor_hm,
labels_row = labs,
labels_col = paste("Module", c(1:ncol(MEs))),
display_numbers = pvals_hm,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
angle_col = 315)
module_cor_heatmap = pheatmap(mat = moduleTraitCor_hm,
labels_row = labs,
labels_col = paste("Module", c(1:ncol(MEs))),
display_numbers = pvals_hm,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
angle_col = 315,
fontsize = 32)
Module 12 (turquoise) looks like the key MoAM cluster, 4 (blue) is key TRAM, and module 14 (purple) is likely our IFN cluster.
# Define numbers of genes and samples
nGenes = ncol(all_counts)
nSamples = nrow(all_counts)
md_of_interest = metadata %>%
mutate(deceased = ifelse(binned_outcome == "Deceased",
yes = 1,
no = 0)) %>%
mutate(has_covid = ifelse(covid_confirmed == "TRUE",
yes = 1,
no = 0)) %>%
mutate(cov2_detected = ifelse(sars_cov2_detected == TRUE,
yes = 1,
no = 0)) %>%
mutate(has_pneumonia = ifelse(pna_type == "Non-Pneumonia Control",
yes = 0,
no = 1)) %>%
mutate(coinfection_detected = ifelse(any_nonviral == "TRUE",
yes = 1,
no = 0)) %>%
mutate(other_viral_pna = ifelse(pna_type == "Other Viral Pneumonia",
yes = 1,
no = 0)) %>%
mutate(nonviral_pna = ifelse(pna_type == "Other Pneumonia",
yes = 1,
no = 0)) %>%
mutate(is_healthy_control = ifelse(pna_type == "Healthy Control",
yes = 1,
no = 0)) %>%
dplyr::select(deceased, has_covid, cov2_detected, has_pneumonia,
other_viral_pna, nonviral_pna, is_healthy_control,
percent_neutrophils, percent_total_CD206_high,
percent_CD4_total, percent_CD8_total, finite_day_of_intubation,
C_Reactive_Protein, D_DIMER, PROCALCITONIN, mean_aps,
Static_Compliance, PF_ratio)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(all_counts, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = bicor(MEs,
md_of_interest,
use = "pairwise.complete.obs",
maxPOutliers = 0.05)
bicor: zero MAD in variable 'y'. Pearson correlation was used for individual columns with zero (or missing) MAD.
moduleTraitPvalue = corPvalueStudent(moduleTraitCor,
nSamples) %>%
as.numeric() %>%
p.adjust(., method = "fdr") %>%
matrix(ncol = ncol(moduleTraitCor))
colnames(moduleTraitPvalue) = colnames(moduleTraitCor)
rownames(moduleTraitPvalue) = rownames(moduleTraitCor)
moduleTraitCor_hm = t(moduleTraitCor)
#just make significant or not
pvals_hm = t(moduleTraitPvalue) %>%
as.matrix()
pvals_hm[pvals_hm < 0.05] = ""
pvals_hm[pvals_hm != ""] = "NS"
labs = c("Deceased", "COVID-19", "SARS-CoV-2 Detected", "Any Pneumonia", "Other Viral Pneumonia", "Other Pneumonia",
"Healthy Control", "% Neutrophils", "% Macrophages CD206-high", "% CD4 T", "% CD8 T",
"Day of Intubation", "CRP", "D-Dimer", "Procalcitonin", "Mean APS",
"Static Compliance", "P/F Ratio")
module_cor_heatmap_vent_mini = pheatmap(mat = moduleTraitCor_hm,
labels_row = labs,
labels_col = paste("Module", c(1:ncol(MEs))),
display_numbers = pvals_hm,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
angle_col = 315)
module_cor_heatmap_vent = pheatmap(mat = moduleTraitCor_hm,
labels_row = labs,
labels_col = paste("Module", c(1:ncol(MEs))),
display_numbers = pvals_hm,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
angle_col = 315,
fontsize = 24)
saveRDS(module_cor_heatmap_vent, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200803_WGCNA_cor_heatmap.rds")
Module 13 (salmon) may also be interesting. Correlates with COVID, CRP, lymphocytes, and vent params!
module_assignments = data.frame(ensembl_gene_id = colnames(all_counts),
module = factor(mergedColors)) %>%
left_join(., gene_conv)
Joining, by = "ensembl_gene_id"
write.csv(module_assignments,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_WGCNA_module_assignments.csv")
cd206_correlated = subset(module_assignments, module == "blue")
cd206_anticorrelated = subset(module_assignments, module == "turquoise")
cd206_correlated
cd206_anticorrelated
Pretty ideal, yeah
Purple contains all of the CoV-2 genes!
covid_correlated_cd206_uncorrelated = subset(module_assignments, module == "purple")
covid_correlated_cd206_uncorrelated
write.csv(covid_correlated_cd206_uncorrelated,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_purple_module_14_genes.csv")
Pretty classic interferon response. Look at the GO.
#from k_means_figure
complete_counts = counts(mp_des, normalized = T)
universe = rownames(complete_counts[rowSums(complete_counts) > 0, ])
fisherTest = new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
cluster_genes = covid_correlated_cd206_uncorrelated$ensembl_gene_id
selection = as.numeric(universe %in% cluster_genes)
names(selection) = universe
go_data = new("topGOdata",
ontology = "BP",
allGenes = selection,
geneSel = function(x){
return(x == 1)},
annot = annFUN.org,
mapping = "org.Hs.eg.db",
ID = "ensembl")
Building most specific GOs .....
( 12142 GO terms found. )
Build GO DAG topology ..........
( 16096 GO terms and 38209 relations. )
Annotating nodes ...............
( 16536 genes annotated to the GO terms. )
#run Fisher test
test_results = getSigGroups(go_data, fisherTest)
-- Classic Algorithm --
the algorithm is scoring 3412 nontrivial nodes
parameters:
test statistic: Fisher test
module14_score = as.data.frame(score(test_results))
colnames(module14_score) = "pval"
module14_score = rownames_to_column(module14_score, var = "go_id")
#adjust p-values and take significant
module14_score$padj = p.adjust(module14_score$pval, method = "fdr")
module14_score = subset(module14_score, padj < 0.05)
module14_score$description = NA
for(i in 1:nrow(module14_score))
{
module14_score$description[i] = GOTERM[[module14_score$go_id[i]]]@Term
}
#add descriptions
module14_score$full_go = paste(module14_score$go_id, module14_score$description)
write.csv(module14_score,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_purple_module_14_GO.csv")
module14_score
Pretty much all Type I interferon related. Most of the paper is here.
This module has IRF1 and the MHC-I genes
module13 = subset(module_assignments, module == "salmon")
module13
write.csv(module13,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_salmon_module_13_genes.csv")
#from k_means_figure
cluster_genes = module13$ensembl_gene_id
selection = as.numeric(universe %in% cluster_genes)
names(selection) = universe
go_data = new("topGOdata",
ontology = "BP",
allGenes = selection,
geneSel = function(x){
return(x == 1)},
annot = annFUN.org,
mapping = "org.Hs.eg.db",
ID = "ensembl")
Building most specific GOs .....
( 12142 GO terms found. )
Build GO DAG topology ..........
( 16096 GO terms and 38209 relations. )
Annotating nodes ...............
( 16536 genes annotated to the GO terms. )
#run Fisher test
test_results = getSigGroups(go_data, fisherTest)
-- Classic Algorithm --
the algorithm is scoring 2785 nontrivial nodes
parameters:
test statistic: Fisher test
module13_score = as.data.frame(score(test_results))
colnames(module13_score) = "pval"
module13_score = rownames_to_column(module13_score, var = "go_id")
#adjust p-values and take significant
module13_score$padj = p.adjust(module13_score$pval, method = "fdr")
module13_score = subset(module13_score, padj < 0.05)
# module13_score$description = NA
# for(i in 1:nrow(module13_score))
# {
# module13_score$description[i] = GOTERM[[module13_score$go_id[i]]]@Term
# }
#
# #add descriptions
# module13_score$full_go = paste(module13_score$go_id, module13_score$description)
# write.csv(module13_score,
# "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_salmon_module_13_GO.csv")
# module13_score
No significant hits.
ccl24_counts = rownames_to_column(ccl24_counts, var = "sample")
ccl24_counts = ccl24_counts[, 1:2]
colnames(ccl24_counts)[2] = "CCL24"
total_umap = left_join(total_umap, ccl24_counts)
Joining, by = "sample"
ccl24_umap = ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = CCL24, color = pna_type, shape = binned_outcome)) +
geom_point() +
theme_bw() +
theme(text = element_text(size = 16, family = "Arial"),
legend.text = element_text(size = 16, family = "Arial"),
legend.title = element_text(size = 16, family = "Arial")) +
scale_color_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non-Pneumonia Control" = fig2_pal[2],
"COVID-19" = fig2_pal[1],
"Other Viral Pneumonia" = fig2_pal[3],
"Other Pneumonia" = fig2_pal[4]),
na.value = "black") +
scale_shape_manual(name = "",
values = c("Deceased" = 13,
"Discharged" = 19,
"Inpatient Facility" = 18,
"Other" = 18),
na.value = 1) +
scale_size_continuous(name = "CCL24 Counts", trans = "log10", range = c(1, 10)) +
xlab("UMAP 1") +
ylab("UMAP 2")
ccl24_umap
This is really cool! This isolates most the the IFN-non-responsive COVID patients. Maybe this drives heterogeneity in celltype composition in the lung?
tcell_umap = ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = percent_CD3_total, color = pna_type, shape = binned_outcome)) +
geom_point() +
theme_bw() +
theme(text = element_text(size = 16, family = "Arial"),
legend.text = element_text(size = 16, family = "Arial"),
legend.title = element_text(size = 16, family = "Arial")) +
scale_color_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non-Pneumonia Control" = fig2_pal[2],
"COVID-19" = fig2_pal[1],
"Other Viral Pneumonia" = fig2_pal[3],
"Other Pneumonia" = fig2_pal[4]),
na.value = "black") +
scale_shape_manual(name = "",
values = c("Deceased" = 13,
"Discharged" = 19,
"Inpatient Facility" = 18,
"Other" = 18),
na.value = 1) +
scale_size_continuous(name = "Percent Lymphocytes", range = c(1, 10)) +
xlab("UMAP 1") +
ylab("UMAP 2")
tcell_umap
TRAM content
ggplot(total_umap, aes(x = day_of_intubation, y = AEblue)) +
geom_point() +
geom_smooth()
ggplot(total_umap, aes(x = day_of_intubation, y = AEturquoise)) +
geom_point() +
geom_smooth()
Not much of a correlation…maybe a flat line, as I guess would be expected with near-constant recruitment.
orf8 = cov2_genes$ensembl_gene_id[which(cov2_genes$gene_name == "ORF8")]
orf8_counts = plotCounts(mp_des,
gene = orf8,
intgroup = "pna_type",
returnData = T,
normalized = T,
pc = T)
orf8_samples = rownames(subset(orf8_counts, count >0))
ggplot(orf8_counts, aes(x = pna_type, y = count)) +
geom_boxplot() +
geom_jitter(aes(color = count > 0)) +
scale_y_log10() +
ylab("ORF8 Counts") +
xlab("")
Not a ton of detection but very clean.
mp_des$orf8_detected = mp_des$sample %in% orf8_samples
total_umap$orf8_detected = total_umap$sample %in% orf8_samples
ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = AEpurple, color = Diagnosis, shape = orf8_detected)) +
geom_point()
Pretty clear spatial correlation…but in the opposite direction?
ggplot(total_umap, aes(x = orf8_detected, y = AEpurple)) +
geom_boxplot() +
geom_jitter(aes(color = Diagnosis))
high_cov2 = colnames(cov2_counts)[colSums(cov2_counts) > 50]
total_umap$cov2_detected = total_umap$sample %in% high_cov2
ggplot(total_umap, aes(x = cov2_detected, y = AEpurple)) +
geom_boxplot() +
geom_jitter(aes(color = Diagnosis))
This really just correlates with overall viral load (which correlates, in turn, with ORF8 levels). No support really for the ORF8 inhibition of the IFN response, at least at this stage.
human_mart = useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37)
#now pull down genes based on gene ontology (GO) definitions
typeI_IFN_response = getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
filters = "go",
values = "GO:0060337",
mart = human_mart)
`select_()` is deprecated as of dplyr 0.7.0.
Please use `select()` instead.
[90mThis warning is displayed once every 8 hours.[39m
[90mCall `lifecycle::last_warnings()` to see where this warning was generated.[39m`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
[90mThis warning is displayed once every 8 hours.[39m
[90mCall `lifecycle::last_warnings()` to see where this warning was generated.[39mCache found
IFNg_response = getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
filters = "go",
values = "GO:0060333",
mart = human_mart)
Cache found
covid_umap = covid_umap %>%
left_join(., type1_counts) %>%
left_join(., type2_counts) %>%
left_join(., ifn_counts)
Joining, by = "sample"
Joining, by = "sample"
Joining, by = "sample"
typei_cor = cor.test(covid_umap$typeI_IFN_response, covid_umap$finite_day_of_intubation)
typeii_cor = cor.test(covid_umap$IFNg_response, covid_umap$finite_day_of_intubation)
general_cor = cor.test(covid_umap$general_IFN_response, covid_umap$finite_day_of_intubation)
Let’s focus on interferon-hi vs interferon-low samples
### Identify interesting
last_batch = c(300312389, 304012699, 304017553, 304020298, 304020362,
340224808, 340233896, 340239789, 340239790, 340239805)
good_samples = mp_des[, ((!is.na(mp_des$RIN) & mp_des$RIN >= 7) &
mp_des$STAR_mqc_generalstats_star_uniquely_mapped_percent >= 30 &
mp_des$featureCounts_mqc_generalstats_featurecounts_percent_assigned >= 30) &
mp_des$RNA_concentration_pg_ul >= 125]
remaining_interesting = setdiff(good_samples$sample, last_batch)
interesting_metadata = subset(total_umap, sample %in% remaining_interesting)
table(interesting_metadata$pna_type)
ggplot(interesting_metadata, aes(x = pna_type, y = AEpurple)) +
geom_boxplot()
selection = interesting_metadata %>%
filter(pna_type != "Other Pneumonia" | sample == 340235110) %>%
dplyr::select(sample, tube_loc_macrophRNA_boxID, RNA_concentration_pg_ul) %>%
mutate(dilution = ifelse((250 / RNA_concentration_pg_ul) < 0.4,
yes = round(1 / (250 / RNA_concentration_pg_ul) / 5) * 5,
no = 1)) %>%
mutate(vol_for_250_pg = round(250 / RNA_concentration_pg_ul * dilution, digits = 2))
write.csv(selection, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200713_pico_opt_batch2.csv")
selection
final_counts_raw = counts(mp_des, normalized = F)
#clean metadata for safe sharing
final_md_safe = colData(mp_des) %>%
as.data.frame() %>%
dplyr::select(sample, age, sex, diagnosis = pna_type,
day_of_ventilation = finite_day_of_intubation, study_id)
#convert script IDs to new identifiers
id_conv = data.frame(script_id = sample(unique(final_md_safe$study_id))) %>% #shuffle IDs first
mutate(anonymized_patient_id = sprintf("%04d", 1:nrow(id_conv)))
#add back to md
final_md_safe = left_join(final_md_safe, id_conv, by = c("study_id" = "script_id")) %>%
dplyr::select(-study_id)
write.csv(final_counts_raw, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/GEO_output/200724_raw_counts_geo.csv")
write.csv(final_md_safe, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/GEO_output/200724_deidentified_metadata_geo.csv")
all_comparisons = combn(levels(mp_des$Diagnosis), 2, simplify = F)
names(all_comparisons) = vapply(all_comparisons, function(x){
return(paste0(x[1], " vs ", x[2]))}, "char")
dea_hits = lapply(all_comparisons, function(x){
hits = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", x[1], x[2]),
alpha = 0.05,
parallel = T)) %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
arrange(padj)
return(hits)})
names(dea_hits) = names(all_comparisons)
saveRDS(dea_hits, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/GEO_output/200728_bulk_dea_complete.rds")
output_genes = cov2_kmeans_clustered$genes %>%
dplyr::select(ensembl_gene_id = gene, cluster, external_gene_name) %>%
arrange(cluster)
names(dea_hits) = gsub("Healthy_Control", "HC", names(dea_hits))
names(dea_hits) = gsub("Other_Viral_Pneumonia", "OVP", names(dea_hits))
names(dea_hits) = gsub("Other_Pneumonia", "OP", names(dea_hits))
names(dea_hits) = gsub("Non_Pneumonia_Control", "NPC", names(dea_hits))
names(dea_hits) = gsub("COVID_19", "COVID-19", names(dea_hits))
all_sheets = c(list("k-means genes" = output_genes,
"k-means GO enrichment" = cov2_kmeans_clustered$go_df),
dea_hits)
write.xlsx(x = all_sheets,
file = "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200728_bulk_supplmental_data.xlsx")
fig3_heatmap = cov2_kmeans_clustered$plot
fig3_heatmap_gg = as.ggplot(fig3_heatmap)
viral_expression_heatmap_gg = as.ggplot(viral_expression_heatmap)
module_cor_heatmap_vent_gg = as.ggplot(module_cor_heatmap_vent)
Error in as.grob(plot) : object 'module_cor_heatmap_vent' not found
Marker genes
marker_gene_plot = ggplot(s3_counts, aes(x = Diagnosis, y = count, fill = Diagnosis)) +
geom_boxplot(width = 0.5, outlier.shape = NA) +
geom_jitter(width = 0.25) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4])) +
scale_x_discrete(labels = c("Healthy_Control" = "Healthy\nControl",
"Non_Pneumonia_Control" = "Non-Pneumonia\nControl",
"COVID_19" = "COVID-19",
"Other_Viral_Pneumonia" = "Other Viral\nPneumonia",
"Other_Pneumonia" = "Other\nPneumonia")) +
xlab("") +
ylab("Gene Counts") +
facet_wrap(~gene_name, scale = "free_y", ncol = 3) +
geom_signif(inherit.aes = F,
data = gene_comps,
aes(xmin = numerator, xmax = denominator, annotations = label, y_position = y_log10),
tip_length = 0,
manual=TRUE,
family = "Arial",
textsize = 4) +
theme_bw(base_family = "Arial", base_size = 32) +
theme(legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
Ignoring unknown aesthetics: xmin, xmax, annotations, y_position
cov2_kmeans_noclustering = readRDS("/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_k_means_unclustered.rds")
coverage_plot = readRDS("/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200803_cov2_coverage_plot.rds")
figS3_heatmap = cov2_kmeans_noclustering$plot
figS3_heatmap_gg = as.ggplot(figS3_heatmap)
layout = "ABB\nABB\nABB\nABB\n#BB\nCBB\nDBB"
bulk_plot_fig3 = figS3_heatmap_gg +
marker_gene_plot +
sars_time_cor +
coverage_plot +
plot_layout(design = layout) +
plot_annotation(tag_levels = 'a') & theme(plot.tag = element_text(size = 48))
bulk_plot_fig3
1
[1] 1
md = as.data.frame(colData(mp_des))
col_types = unlist(lapply(md, class))
list_cols = names(col_types[col_types == "AsIs"])
safe_md = md %>%
dplyr::select(-c(all_of(list_cols))) %>%
as.data.frame()
write.csv(safe_md, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200730_bulk_mp_metadata.csv")